Homework5

Author

Becky Jin

Published

April 16, 2024

options(repos = c(CRAN = "https://cloud.r-project.org"))
#| warning: false
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
here() starts at /Users/apple/Desktop/ling343-files/homework-5-beckyjyn
install.packages("knitr")

The downloaded binary packages are in
    /var/folders/y4/2c8ll6k92nd21739y4jnkk7w0000gn/T//Rtmp8TzD6f/downloaded_packages
install.packages("kableExtra")

The downloaded binary packages are in
    /var/folders/y4/2c8ll6k92nd21739y4jnkk7w0000gn/T//Rtmp8TzD6f/downloaded_packages
library(knitr)
library(kableExtra)

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows
library(tibble)
install.packages("gt")

The downloaded binary packages are in
    /var/folders/y4/2c8ll6k92nd21739y4jnkk7w0000gn/T//Rtmp8TzD6f/downloaded_packages
library(gt)
install.packages("ggplot2")

The downloaded binary packages are in
    /var/folders/y4/2c8ll6k92nd21739y4jnkk7w0000gn/T//Rtmp8TzD6f/downloaded_packages
install.packages("plotly")

The downloaded binary packages are in
    /var/folders/y4/2c8ll6k92nd21739y4jnkk7w0000gn/T//Rtmp8TzD6f/downloaded_packages
library(ggplot2)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Importing Data

here::i_am("analysis/homework5-beckyjyn.qmd")
here() starts at /Users/apple/Desktop/ling343-files/homework-5-beckyjyn
df_all <- read.csv(here("data/delong maze 40Ss.csv"), 
              header = 1, sep = ",", comment.char = "#", strip.white = T,
              col.names = c("Index","Time","Counter","Hash","Owner","Controller","Item","Element","Type","Group","FieldName","Value","WordNum","Word","Alt","WordOn","CorrWord","RT","Sent","TotalTime","Question","Resp","Acc","RespRT"))

Extract Reaction time

df_rt <- df_all |> 
  filter(Controller == "Maze" & !str_detect(Type, "prac")) |> 
  select(1:10, 13:20) |> 
  separate(col = Type, 
           into = c("exp", "item", "expect", "position", "pos", 
                    "cloze", "art.cloze", "n.cloze"), 
           sep = "\\.", convert = TRUE, fill = "right") |> 
  mutate(WordNum = as.numeric(WordNum),
         Acc = as.numeric(as.character(recode(CorrWord, yes = "1", no = "0"))),
         n.cloze.scale =  scale(n.cloze), 
         art.cloze.scale = scale(art.cloze)) |> 
  mutate(across(where(is.character), as.factor)) |> 
  filter(item != 29) |> 
  filter(Hash != "9dAvrH0+R6a0U5adPzZSyA")
rt.s <- df_rt 
rt.s$rgn.fix <- rt.s$WordNum - rt.s$pos + 1
rt.s$word.num.z <- scale(rt.s$WordNum)
rt.s$word.len <- nchar(as.character(rt.s$Word))
rt.s$Altword.len <- nchar(as.character(rt.s$Alt))
# simplying by using dummy/treatment coding instead of sum coding
# 'expected' will be reference level
#contrasts(rt.s$expect) <- c(-.5,.5)

rt.s$item.expect <- paste(rt.s$item, rt.s$expect, sep=".")
rt.s.filt <- rt.s[rt.s$Hash != "gyxidIf0fqXBM7nxg2K7SQ" & rt.s$Hash != "f8dC3CkleTBP9lUufzUOyQ",]

rgn.rt.raw <- rt.s.filt %>%
  filter(rgn.fix > -4 & rgn.fix < 5) %>%
  filter(Acc == 1) %>%
  group_by(rgn.fix, expect) %>%
  summarize(n = n(), 
            subj = length(unique(Hash)), 
            rt = mean(RT), 
            sd = sd(RT), 
            stderr = sd / sqrt(subj)) %>%
  as.data.frame()
`summarise()` has grouped output by 'rgn.fix'. You can override using the
`.groups` argument.
rgn.rt.raw$rgn <- as.factor(recode(rgn.rt.raw$rgn.fix, "-3"="CW-3", "-2"="CW-2", "-1"="CW-1", "0"="art", "1"="n","2"="CW+1", "3"="CW+2", "4"="CW+3"))
rgn.rt.raw$rgn <- ordered(rgn.rt.raw$rgn, levels = c("CW-3", "CW-2", "CW-1", "art", "n", "CW+1", "CW+2", "CW+3"))
rt.s.filt %>% 
  filter(rgn.fix == 0) %>% 
  group_by(Hash, expect) %>% 
  summarize(RT = mean(RT, na.rm = TRUE)) %>% 
  group_by(expect) %>% 
  summarize(RT = mean(RT, na.rm = TRUE)) %>% 
  gt() %>% 
  fmt_number(decimals = 2)
`summarise()` has grouped output by 'Hash'. You can override using the
`.groups` argument.
expect RT
expected 672.12
unexpected 716.56
set.seed(343)
p_parts <- rt.s.filt |> 
  filter(rgn.fix == 0) |> 
  group_by(Hash, expect) |> 
  summarize(RT = mean(RT, na.rm = TRUE)) |> 
  ggplot(aes(x=expect, y=RT, color = expect)) +
  geom_jitter(stat = "identity", width = .1, alpha = .8) +
  geom_point(stat = "summary", fun = mean, 
             shape = 4, color = "blue", size = 4) +
  labs(x = "Condition", y = "Reading Time (msec)") + 
  theme_bw()  
`summarise()` has grouped output by 'Hash'. You can override using the
`.groups` argument.
ggplotly(p_parts)

Describe the study

This study concentrates on investigating how English speakers predict following phonological forms based on grammatical and phonetic cues, using the choice between articles “a” and “an” as an instance. Whether “a” or “an” is required depends on the initial sound of the word that follows the article. Usually, words that begin a vowel sound would need “an” while the ones that begin with a consonant would rather need “a”.

Data Dictionary

These are the variables in the raw data.

df_all_col_names <- colnames(df_all)
not_na_counts <- sapply(df_all, function(x) sum(!is.na(x)))
data_dictionary <- tibble(Variable = df_all_col_names, Valid_cnts = not_na_counts)
print(data_dictionary)
# A tibble: 24 × 2
   Variable   Valid_cnts
   <chr>           <int>
 1 Index           73632
 2 Time            73632
 3 Counter         73632
 4 Hash            73632
 5 Owner           73632
 6 Controller      73632
 7 Item            73632
 8 Element         73632
 9 Type            73632
10 Group           73632
# ℹ 14 more rows
variable_explained <- tribble(
  ~Essential_Variable, ~Description,
  "Hash", "hash key of each participant",
  "Owner", "whether the participant is experiment owner or not",
  "Controller", "{Form|Maze|Question}",
  "Item", "sentence item index",
  "Type", "type of the given task",
  "Group", "group of the given stimuli",
  "Word", "original word in sentence",
  "Alt", "word alternative to the original word",
  "WordOn", "boolean value that determines the word is which of the above two types",
  "CorrWord", "if the word is correct or incorrect",
  "RT", "response time",
  "Sent", "the complete sentence",
  "TotalTime", "time taken for a correct response",
  "Question", "comprehension question", 
  "Resp", "participant's response to the above question",
  "Acc", "reponse is correctly accepted or not",
  "RespRT", "reaction time of the response"
)
print(variable_explained)
# A tibble: 17 × 2
   Essential_Variable Description                                               
   <chr>              <chr>                                                     
 1 Hash               hash key of each participant                              
 2 Owner              whether the participant is experiment owner or not        
 3 Controller         {Form|Maze|Question}                                      
 4 Item               sentence item index                                       
 5 Type               type of the given task                                    
 6 Group              group of the given stimuli                                
 7 Word               original word in sentence                                 
 8 Alt                word alternative to the original word                     
 9 WordOn             boolean value that determines the word is which of the ab…
10 CorrWord           if the word is correct or incorrect                       
11 RT                 response time                                             
12 Sent               the complete sentence                                     
13 TotalTime          time taken for a correct response                         
14 Question           comprehension question                                    
15 Resp               participant's response to the above question              
16 Acc                reponse is correctly accepted or not                      
17 RespRT             reaction time of the response                             
participant_info <- tribble(
  ~Abbrev, ~Full,
  "age", "-",
  "natlang", "Native language learned",
  "state", "-",
  "parentlang", "Language used by their parents",
  "domlang", "Dominant language", 
  "otherlang", "Other language learned",
  "gender", "-",
  "question- ", "..."
)
print(participant_info)
# A tibble: 8 × 2
  Abbrev       Full                          
  <chr>        <chr>                         
1 "age"        -                             
2 "natlang"    Native language learned       
3 "state"      -                             
4 "parentlang" Language used by their parents
5 "domlang"    Dominant language             
6 "otherlang"  Other language learned        
7 "gender"     -                             
8 "question- " ...                           

How many participants does the data have data in total?

Use inline R-code to “print” this number in the markdown text.

participants_num <- length(unique(df_all$Hash))
participants_num
[1] 39

Concerning participants’ age stats

df_all$Age <- as.numeric(as.character(df_all$Value), na.rm = TRUE)
Warning: NAs introduced by coercion
participants_age <- df_all %>% 
  summarize(
    Mean_Age = mean(Age, na.rm = TRUE),
    Min_Age = min(Age, na.rm = TRUE),
    Max_Age = max(Age, na.rm = TRUE),
    Sd_Age = sd(Age, na.rm = TRUE)
  )
print(participants_age)
  Mean_Age Min_Age Max_Age   Sd_Age
1 34.87179      18      71 14.08093

Reproduce the figure in Figure 1

df_raw <- rgn.rt.raw
df_raw %>% spread(rgn, rt)
   rgn.fix     expect subj        sd    stderr     CW-3     CW-2     CW-1
1       -3   expected   36  285.8839  47.64731 753.1880       NA       NA
2       -3 unexpected   36  306.2786  51.04644 757.1725       NA       NA
3       -2   expected   36  309.4916  51.58193       NA 733.0036       NA
4       -2 unexpected   36  411.7028  68.61713       NA 742.7986       NA
5       -1   expected   36  323.5000  53.91667       NA       NA 760.2436
6       -1 unexpected   36  272.0673  45.34454       NA       NA 751.1713
7        0   expected   36  224.8922  37.48203       NA       NA       NA
8        0 unexpected   36  339.1033  56.51721       NA       NA       NA
9        1   expected   36  255.3778  42.56296       NA       NA       NA
10       1 unexpected   36  542.2348  90.37247       NA       NA       NA
11       2   expected   36  327.1993  54.53322       NA       NA       NA
12       2 unexpected   36 1168.4236 194.73727       NA       NA       NA
13       3   expected   36  324.7641  54.12735       NA       NA       NA
14       3 unexpected   36  389.0720  64.84534       NA       NA       NA
15       4   expected   36  296.2991  49.38319       NA       NA       NA
16       4 unexpected   36  297.0947  49.51578       NA       NA       NA
        art        n     CW+1     CW+2     CW+3
1        NA       NA       NA       NA       NA
2        NA       NA       NA       NA       NA
3        NA       NA       NA       NA       NA
4        NA       NA       NA       NA       NA
5        NA       NA       NA       NA       NA
6        NA       NA       NA       NA       NA
7  674.1306       NA       NA       NA       NA
8  719.3884       NA       NA       NA       NA
9        NA  704.219       NA       NA       NA
10       NA 1061.635       NA       NA       NA
11       NA       NA 781.0073       NA       NA
12       NA       NA 859.8654       NA       NA
13       NA       NA       NA 785.0631       NA
14       NA       NA       NA 793.6538       NA
15       NA       NA       NA       NA 766.7514
16       NA       NA       NA       NA 789.8782
df_raw$rgn.fix <- NULL
df_raw %>% 
  mutate(condition = expect)
       expect    n subj        rt        sd    stderr  rgn  condition
1    expected 1383   36  753.1880  285.8839  47.64731 CW-3   expected
2  unexpected 1380   36  757.1725  306.2786  51.04644 CW-3 unexpected
3    expected 1376   36  733.0036  309.4916  51.58193 CW-2   expected
4  unexpected 1380   36  742.7986  411.7028  68.61713 CW-2 unexpected
5    expected 1375   36  760.2436  323.5000  53.91667 CW-1   expected
6  unexpected 1389   36  751.1713  272.0673  45.34454 CW-1 unexpected
7    expected 1363   36  674.1306  224.8922  37.48203  art   expected
8  unexpected 1385   36  719.3884  339.1033  56.51721  art unexpected
9    expected 1397   36  704.2190  255.3778  42.56296    n   expected
10 unexpected 1388   36 1061.6347  542.2348  90.37247    n unexpected
11   expected 1377   36  781.0073  327.1993  54.53322 CW+1   expected
12 unexpected 1389   36  859.8654 1168.4236 194.73727 CW+1 unexpected
13   expected 1363   36  785.0631  324.7641  54.12735 CW+2   expected
14 unexpected 1349   36  793.6538  389.0720  64.84534 CW+2 unexpected
15   expected 1215   36  766.7514  296.2991  49.38319 CW+3   expected
16 unexpected 1215   36  789.8782  297.0947  49.51578 CW+3 unexpected
df_raw$expect <- NULL
df_raw
      n subj        rt        sd    stderr  rgn
1  1383   36  753.1880  285.8839  47.64731 CW-3
2  1380   36  757.1725  306.2786  51.04644 CW-3
3  1376   36  733.0036  309.4916  51.58193 CW-2
4  1380   36  742.7986  411.7028  68.61713 CW-2
5  1375   36  760.2436  323.5000  53.91667 CW-1
6  1389   36  751.1713  272.0673  45.34454 CW-1
7  1363   36  674.1306  224.8922  37.48203  art
8  1385   36  719.3884  339.1033  56.51721  art
9  1397   36  704.2190  255.3778  42.56296    n
10 1388   36 1061.6347  542.2348  90.37247    n
11 1377   36  781.0073  327.1993  54.53322 CW+1
12 1389   36  859.8654 1168.4236 194.73727 CW+1
13 1363   36  785.0631  324.7641  54.12735 CW+2
14 1349   36  793.6538  389.0720  64.84534 CW+2
15 1215   36  766.7514  296.2991  49.38319 CW+3
16 1215   36  789.8782  297.0947  49.51578 CW+3
rt.s.filt |> 
  filter(rgn.fix == 0) |> 
  group_by(Hash, expect) |> 
  summarize(RT = mean(RT, na.rm = TRUE)) |> 
  ggplot(aes(x=expect, y=RT, shape = expect, group = Hash, color = Hash)) +
  geom_line() +
  geom_point(stat = "identity", alpha = .8, size = 2) +
  labs(x = "Condition", y = "Reading Time (msec)") + 
  theme_minimal() +
  theme(legend.position = "none") 
`summarise()` has grouped output by 'Hash'. You can override using the
`.groups` argument.

# package for linear mixed effects
library(lme4)
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Warning in check_dep_version(): ABI version mismatch: 
lme4 was built with Matrix ABI version 1
Current Matrix ABI version is 0
Please re-install lme4 from source or restore original 'Matrix' package
 # package for p-values from lme4 models
library(lmerTest)

Attaching package: 'lmerTest'
The following object is masked from 'package:lme4':

    lmer
The following object is masked from 'package:stats':

    step
m_lm <- lm(RT ~ expect, 
           data = filter(rt.s.filt, rgn.fix == 0))
summary(m_lm)

Call:
lm(formula = RT ~ expect, data = filter(rt.s.filt, rgn.fix == 
    0))

Residuals:
   Min     1Q Median     3Q    Max 
-603.8 -159.8  -66.9   62.4 3964.2 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)       672.017      7.675  87.557  < 2e-16 ***
expectunexpected   44.803     10.847   4.131 3.72e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 289.2 on 2842 degrees of freedom
Multiple R-squared:  0.005968,  Adjusted R-squared:  0.005618 
F-statistic: 17.06 on 1 and 2842 DF,  p-value: 3.723e-05
install.packages("gtsummary")

The downloaded binary packages are in
    /var/folders/y4/2c8ll6k92nd21739y4jnkk7w0000gn/T//Rtmp8TzD6f/downloaded_packages
library(gtsummary)
tbl_regression(m_lm, conf.int = FALSE)
Characteristic Beta p-value
expect

    expected
    unexpected 45 <0.001
#m_mixed <- lmer(RT ~ expect + (1 | Hash) + (1 + expect | item), 
           #data = filter(rt.s.filt, rgn.fix == 0))
#summary(m_mixed)
#tbl_regression(m_mixed, conf.int = FALSE)
#m_noun <- lmer(RT ~ expect + (1 | Hash) + (1 + expect | item), 
           #data = filter(rt.s.filt, rgn.fix == 1))
#tbl_regression(m_noun, conf.int = FALSE)